library(sf)
## Linking to GEOS 3.8.1, GDAL 3.2.1, PROJ 7.2.1
library(stringr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ forcats 0.5.1
## ✓ readr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(huxtable)
##
## Attaching package: 'huxtable'
## The following object is masked from 'package:dplyr':
##
## add_rownames
## The following object is masked from 'package:ggplot2':
##
## theme_grey
library(broom)
wahlkreise_shp <- st_read(my_path_wahlkreise)
## Reading layer `Geometrie_Wahlkreise_20DBT' from data source
## `/Users/duzhiting/Documents/GitHub/R-Prediction/btw21_geometrie_wahlkreise_shp/Geometrie_Wahlkreise_20DBT.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 299 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 280371.1 ymin: 5235856 xmax: 921120.1 ymax: 6101444
## Projected CRS: ETRS89 / UTM zone 32N
glimpse(wahlkreise_shp)
## Rows: 299
## Columns: 5
## $ WKR_NR <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 1…
## $ WKR_NAME <chr> "Flensburg – Schleswig", "Nordfriesland – Dithmarschen Nord"…
## $ LAND_NR <chr> "01", "01", "01", "01", "01", "01", "01", "01", "01", "01", …
## $ LAND_NAME <chr> "Schleswig-Holstein", "Schleswig-Holstein", "Schleswig-Holst…
## $ geometry <MULTIPOLYGON [m]> MULTIPOLYGON (((545529.8 60..., MULTIPOLYGON ((…
wahlkreise_shp %>%
ggplot() +
geom_sf()
wahlkreise_shp %>%
ggplot() +
geom_sf(fill = "grey40") +
theme_void()
unemp_file <- "~/Documents/Github/R-Prediction/btw21_Strukturdaten.csv"
file.exists(unemp_file)
## [1] TRUE
unemp_de_raw <- read_delim(unemp_file,
";", escape_double = FALSE,
locale = locale(decimal_mark = ",",
grouping_mark = "."),
trim_ws = TRUE,
skip = 8) # skipt the first 8 rows
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## Land = col_character(),
## `Wahlkreis-Nr.` = col_character(),
## `Wahlkreis-Name` = col_character(),
## `Fläche am 31.12.2019 (km²)` = col_number(),
## Fußnoten = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
we need to do some cleansing before we can work with this dataset.
unemp_names <- names(unemp_de_raw)
unemp_de <- unemp_de_raw
names(unemp_de) <- paste0("V",1:ncol(unemp_de))
The important columns are:
The important columns are:
unemp_de <- unemp_de %>%
rename(state = V1,
area_nr = V2,
area_name = V3,
for_prop = V8,
pop_move = V11,
pop_migr_background = V19,
income = V26,
unemp = V47)
elec_results = read.csv2("~/Documents/Github/R-Prediction/kerg.csv", head = TRUE, sep="\t")
head(elec_results)
| Nr | Gebiet | Waehler_gueltige_Zweitstimmen_vorlauefig | AfD3 |
|---|---|---|---|
| 1 | Flensburg – Schleswig | 180112 | 10317 |
| 2 | Nordfriesland – Dithmarschen Nord | 145387 | 8798 |
| 3 | Steinburg – Dithmarschen Süd | 136299 | 11303 |
| 4 | Rendsburg-Eckernförde | 162060 | 10564 |
| 5 | Kiel | 155986 | 7654 |
| 6 | Plön – Neumünster | 133721 | 9741 |
glimpse(wahlkreise_shp)
## Rows: 299
## Columns: 5
## $ WKR_NR <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 1…
## $ WKR_NAME <chr> "Flensburg – Schleswig", "Nordfriesland – Dithmarschen Nord"…
## $ LAND_NR <chr> "01", "01", "01", "01", "01", "01", "01", "01", "01", "01", …
## $ LAND_NAME <chr> "Schleswig-Holstein", "Schleswig-Holstein", "Schleswig-Holst…
## $ geometry <MULTIPOLYGON [m]> MULTIPOLYGON (((545529.8 60..., MULTIPOLYGON ((…
For each party, four values are reported:
primary vote, present election primary vote, previous election secondary vote, present election secondary vote, previous election
afd_prop <- elec_results %>%
rename(afd_votes = AfD3,
area_nr = Nr,
area_name = Gebiet,
votes_total = Waehler_gueltige_Zweitstimmen_vorlauefig) %>%
mutate(afd_prop = afd_votes / votes_total) %>%
na.omit
unemp_de$area_nr <- as.integer(unemp_de$area_nr)
In the previous step, we have selected the columns of interest, changed their name (shorter, English), and have computed the proportion of (valid) secondary votes in favor of the AfD.
wahlkreise_shp %>%
left_join(unemp_de, by = c("WKR_NR" = "area_nr")) %>%
left_join(afd_prop, by = "area_name") -> chloro_data
view(unemp_de)
view(wahlkreise_shp)
view(chloro_data)
chloro_data %>%
ggplot() +
geom_sf(aes(fill = afd_prop)) -> p1
p1
p1 + scale_fill_distiller(palette = "Spectral") +
theme_void()
chloro_data %>%
ggplot() +
geom_sf(aes(fill = unemp)) +
scale_fill_distiller(palette = "Spectral") +
theme_void() -> p2
p2
Let’s compute the percent ranking for each of the variables of interest (AfD votes, unemployment ratio, and income). Then we can compute the concordance for each pair by simply computing the difference (or maybe absolute difference). After that, we plot this “concordance variables” as fill color to the map.
chloro_data %>%
mutate(afd_rank = percent_rank(afd_prop),
unemp_rank = percent_rank(unemp),
income_rank = percent_rank(income)) %>%
mutate(afd_income_diff = subtract(afd_rank, income_rank),
afd_unemp_diff = subtract(afd_rank, unemp_rank)) -> chloro_data
Let’s check the first ranks for each of the variables of interest. AfD ranks first:
chloro_data %>%
as.data.frame %>%
select(area_name, afd_rank, afd_prop, unemp_rank, income_rank) %>%
arrange(-afd_rank) %>%
slice(1:5)
| area_name | afd_rank | afd_prop | unemp_rank | income_rank |
|---|---|---|---|---|
| Görlitz | 1 | 0.321 | 0.856 | 0.228 |
| Sächsische Schweiz-Osterzgebirge | 0.997 | 0.315 | 0.419 | 0.601 |
| Bautzen I | 0.993 | 0.314 | 0.433 | 0.255 |
| Erzgebirgskreis I | 0.99 | 0.302 | 0.362 | 0.617 |
| Mittelsachsen | 0.986 | 0.297 | 0.413 | 0.403 |
Goerlitz leads. Unemployment “top” places:
chloro_data %>%
as.data.frame %>%
select(area_name, afd_prop, unemp_rank, income_rank) %>%
arrange(-unemp_rank) %>%
slice(1:5)
| area_name | afd_prop | unemp_rank | income_rank |
|---|---|---|---|
| Gelsenkirchen | 0.126 | 1 | 0.0101 |
| Duisburg I | 0.0826 | 0.993 | 0.0134 |
| Duisburg II | 0.12 | 0.993 | 0.0134 |
| Bremen II – Bremerhaven | 0.0884 | 0.983 | 0.292 |
| Dortmund I | 0.069 | 0.983 | 0.121 |
Gelsenkirchen is ahead of this sad pack. And the lowest unemployment ranks are at:
chloro_data %>%
as.data.frame %>%
select(area_name, afd_prop, unemp_rank, income_rank) %>%
arrange(unemp_rank) %>%
slice(1:5)
| area_name | afd_prop | unemp_rank | income_rank |
|---|---|---|---|
| Donau-Ries | 0.109 | 0 | 0.651 |
| Erding – Ebersberg | 0.0726 | 0.00336 | 0.862 |
| Biberach | 0.107 | 0.00336 | 0.466 |
| Roth | 0.0846 | 0.0101 | 0.748 |
| Mittelems | 0.0512 | 0.0134 | 0.305 |
And finale income, low 5 and top 5:
chloro_data %>%
as.data.frame %>%
select(area_name, afd_prop, unemp_rank, income_rank) %>%
arrange(income_rank) %>%
slice(c(1:5, 294:299))
| area_name | afd_prop | unemp_rank | income_rank |
|---|---|---|---|
| Helmstedt – Wolfsburg | 0.0937 | 0.48 | 0 |
| Gifhorn – Peine | 0.0963 | 0.332 | 0.00336 |
| Salzgitter – Wolfenbüttel | 0.0979 | 0.718 | 0.00671 |
| Gelsenkirchen | 0.126 | 1 | 0.0101 |
| Duisburg I | 0.0826 | 0.993 | 0.0134 |
| München-West/Mitte | 0.0424 | 0.362 | 0.973 |
| Düsseldorf I | 0.0428 | 0.819 | 0.987 |
| Düsseldorf II | 0.0598 | 0.819 | 0.987 |
| Starnberg – Landsberg am Lech | 0.0611 | 0.0772 | 0.993 |
| Bad Tölz-Wolfratshausen – Miesbach | 0.0796 | 0.057 | 0.997 |
| München-Land | 0.0526 | 0.0436 | 1 |
chloro_data %>%
ggplot() +
geom_sf(aes(fill = afd_unemp_diff)) +
scale_fill_gradient2() +
theme_void() -> p3
p3
The fill color denotes the difference between unemployment rank of a given area and its afd vote rank. For example, if area X has an unemployment rank of .5 (50%), it means that half of the areas in the country have a lower (higher) unemployment ratio, respectively (the median). Similarly, an AfD vote rank of .5 indicates the median position. The difference of these two figures is zero, indicating accordance or close match. Thus, figures around zero denote accordance or match. 1 (100%) of AfD vote rank indicates the area with the best AfD results (area with the most votes); similar reasoning applies for income and unemployment ratio.
Hence, numbers greater than zero indicate that the AfD scored better than it would be expected by the accordance-hypothesis.
Similarly, numbers smaller than zero indicate that the AfD scored better than it would be expected by the accordance-hypothesis.
Areas with (near) white filling provide some support for the accordance hypothesis. There are areas of this type, but it is not the majority. The vast majority of areas showed too much or too little AfD - relative to their unemployment ratio.
This reasonsing shows that the AfD received better results in southern and middle areas of Germany than it would be expected by the accordance hypothesis. In contrast, the more poorer northern areas voted for the AfD much less often than it would be expected by the accordance hypothesis.
Let’s look at the areas with minimal and maximal dis-accordance, out of curiosity.
chloro_data %>%
as.data.frame %>%
select(area_name, afd_unemp_diff, unemp, afd_prop) %>%
arrange(afd_unemp_diff) %>%
slice(c(1:5, 295:299)) %>% hux %>%
add_colnames
| area_name | afd_unemp_diff | unemp | afd_prop |
|---|---|---|---|
| area_name | afd_unemp_diff | unemp | afd_prop |
| Berlin-Friedrichshain-Kreuzberg – Prenzlauer Berg Ost | -0.903 | 10.6 | 0.0405 |
| Köln II | -0.886 | 9.8 | 0.0287 |
| Bremen I | -0.875 | 11.2 | 0.0528 |
| Essen III | -0.868 | 11.5 | 0.0549 |
| Berlin-Charlottenburg-Wilmersdorf | -0.862 | 10.6 | 0.0473 |
| Schwäbisch Hall – Hohenlohe | 0.75 | 3.6 | 0.126 |
| Rottweil – Tuttlingen | 0.754 | 3.7 | 0.132 |
| Neu-Ulm | 0.76 | 3.2 | 0.118 |
| Höxter – Gütersloh III – Lippe II | 4.9 | ||
| Paderborn | 5.9 |
chloro_data %>%
as.data.frame %>%
select(area_name, afd_unemp_diff, unemp, afd_prop) %>%
arrange(afd_unemp_diff) %>%
filter(afd_unemp_diff > -0.05, afd_unemp_diff < .05) %>%
hux %>%
add_colnames
| area_name | afd_unemp_diff | unemp | afd_prop |
|---|---|---|---|
| area_name | afd_unemp_diff | unemp | afd_prop |
| Rotenburg I – Heidekreis | -0.0444 | 5.7 | 0.0795 |
| Halle | -0.0413 | 9.5 | 0.147 |
| Groß-Gerau | -0.0337 | 6.1 | 0.087 |
| Augsburg-Stadt | -0.0299 | 6.4 | 0.0906 |
| Magdeburg | -0.0245 | 9.2 | 0.15 |
| Segeberg – Stormarn-Mitte | -0.0215 | 5 | 0.0728 |
| Herzogtum Lauenburg – Stormarn-Süd | -0.0146 | 5.1 | 0.0746 |
| Celle – Uelzen | -0.00964 | 6.4 | 0.0913 |
| Harburg | -0.00823 | 4.8 | 0.0722 |
| Ludwigshafen/Frankenthal | 0.00512 | 8.1 | 0.117 |
| Nienburg II – Schaumburg | 0.00662 | 5.9 | 0.087 |
| Coesfeld – Steinfurt II | 0.00696 | 3.4 | 0.0456 |
| St. Wendel | 0.0173 | 6.3 | 0.0923 |
| Kreuznach | 0.0177 | 6.7 | 0.0978 |
| Hochsauerlandkreis | 0.0222 | 4.8 | 0.0737 |
| Hochtaunus | 0.0225 | 5.1 | 0.0775 |
| Mecklenburgische Seenplatte I – Vorpommern-Greifswald II | 0.0264 | 9.9 | 0.228 |
| Leipzig I | 0.0393 | 8.3 | 0.154 |
| Siegen-Wittgenstein | 0.0439 | 6 | 0.0907 |
| München-Land | 0.0476 | 3.5 | 0.0526 |
| Altmark | 0.0496 | 8.7 | 0.189 |
Similar story for income.
chloro_data %>%
ggplot() +
geom_sf(aes(fill = afd_income_diff)) +
scale_fill_gradient2() +
theme_void() -> p4
p4
The map shows a clear pattern: The eastern parts of Germany are far more afd-oriented than their income rank would predict (diff scores above zero, blue color). However, for some areas across the whole rest of the country, the likewise pattern is true too: A lot areas are rich and do not vote for the AfD (reddish color, diff score below zero). And, thirdly, a lot of aras support the accordance hypothesis (white color, diff score around zero).
Maybe we should simplify the map: Let’s only distinguish three type of areas: too much AfD in comparison to the unemployment, too few AfD for the unemployment, or AfD at par with unemployment. Maybe the picture is more clearcut then.
chloro_data %>%
select(afd_unemp_diff) %>%
mutate(afd_unemp_diff_3g = cut_interval(afd_unemp_diff, n = 3,
labels = c("AFD < Arbeitslosigkeit",
"AFD = Arbeitslosigkeit",
"AFD > Arbeitslosigkeit"))) %>%
ggplot() +
geom_sf(aes(fill = afd_unemp_diff_3g)) +
labs(fill) +
theme_void()
In a similar vein, we could compute the ratio of AfD votes and unemployment. That would give us some measure of covariability. Let’s see
library(viridis)
## Loading required package: viridisLite
chloro_data %>%
mutate(afd_dens = afd_prop / unemp) %>%
ggplot +
geom_sf(aes(fill = afd_dens)) +
theme_void() +
scale_fill_viridis()
The diagram shows that in relation to unemployment, the AfD votes are strongest in Sachsen. Don’t forget that this measure is an indication of co-occurence, not of absolute AfD votes.
A simple, straight-forward and well-known approach to devise assocation strength is Pearson’s correlation coefficient. Oldie but goldie. Let’s depict it.
chloro_data %>%
select(unemp, afd_prop, income, area_name) %>%
ggplot +
aes(x = unemp, y = afd_prop) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
chloro_data %>%
select(unemp, afd_prop, income, area_name) %>%
as.data.frame %T>%
summarise(cor_afd_unemp = cor(afd_prop, unemp)) %>%
do(tidy(cor.test(.$afd_prop, .$unemp)))
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| 0.0356 | 0.612 | 0.541 | 295 | -0.0785 | 0.149 | Pearson's product-moment correlation | two.sided |
Let’s predict the AfD vote score taking the unemployment as an predictor. Then let’s plot the residuals to see how good the prediction is, ie., how close (or rather, far) the association of unemployment and AfD voting is.
chloro_data$afd_prop[is.na(chloro_data$afd_prop)] <- 0
view(chloro_data$afd_prop)
lm1 <- lm(afd_prop ~ unemp, data = chloro_data)
chloro_data %>%
mutate(afd_lm1 = lm(afd_prop ~ unemp, data = chloro_data)$residuals) -> chloro_data
chloro_data %>%
select(afd_lm1) %>%
ggplot() +
geom_sf(aes(fill = afd_lm1)) +
scale_fill_gradient2() +
theme_void()
This model shows a clearcut picture: The eastern part is too “afd” for its unemployment ratio (some parts of east-southern Bavaria too); the west is less afd-ic than what would be expected by the unemployment. The rest (middle and south) parts over-and-above show the AfD levels that woul be expected by their unemployment.